#include "mpi.h"
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Parallel integer matrix power with MPI.
 *
 * Builds the N x N matrix A with A[i][j] = i + j and left-multiplies the
 * running result by A a total of K - 1 times, so the master prints A^K for
 * K >= 1 (for K <= 1 no multiplication happens and A itself is printed).
 *
 * Every rank computes a contiguous slice of the result's columns. Workers
 * send their slice to the master, which stitches the full matrix together
 * and broadcasts it for the next iteration.
 *
 * Limitation: entries are plain `int`; they grow quickly with K and signed
 * overflow is undefined behaviour, so keep N and K small.
 */
#define MASTER      0   /* rank that initialises, assembles and prints */
#define Tag_Send    1   /* tag for worker -> master column-slice messages */

/*
 * Parse `text` as a base-10 int.
 * Returns 0 and stores the value in *out on success; returns -1 when the
 * text is empty, has trailing characters, or does not fit in an int.
 */
static int parse_int_arg(const char *text, int *out) {
    char *stop = NULL;
    errno = 0;
    long value = strtol(text, &stop, 10);
    if (stop == text || *stop != '\0' || errno == ERANGE ||
        value < INT_MIN || value > INT_MAX) {
        return -1;
    }
    *out = (int)value;
    return 0;
}

/*
 * First column owned by `rank` when n columns are split over `procs` ranks.
 * The slice of rank r is [slice_begin(r), slice_begin(r + 1)).
 * The product is formed in long long so rank * n cannot overflow int.
 */
static int slice_begin(int rank, int n, int procs) {
    return (int)((long long)rank * n / procs);
}

/*
 * Allocate `count` zero-initialised ints.
 * Returns NULL only when count == 0; on allocation failure the whole MPI job
 * is aborted, because the other ranks would otherwise block forever waiting
 * for this one.
 */
static int *alloc_ints_or_abort(size_t count, int rank) {
    if (count == 0) {
        return NULL;
    }
    int *mem = calloc(count, sizeof *mem);   /* calloc checks count * size */
    if (mem == NULL) {
        fprintf(stderr, "rank %d: failed to allocate %zu ints\n", rank, count);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }
    return mem;
}

/*
 * Copy an n x cols column slice (row-major, densely packed) into columns
 * [first, first + cols) of the n x n row-major matrix `dst`.
 */
static void store_slice(int *dst, int n, const int *slice, int first, int cols) {
    for (int r = 0; r < n; r++) {
        for (int c = 0; c < cols; c++) {
            dst[r * n + first + c] = slice[r * cols + c];
        }
    }
}

/*
 * Entry point. Usage: mpirun -np P <prog> N k
 *   N - matrix dimension; must be positive and small enough that N * N fits
 *       in an int (MPI element counts are ints).
 *   k - exponent.
 * Returns 0 on success and EXIT_FAILURE on bad arguments.
 */
int main(int argc, char *argv[]) {
    int procs, rank;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &procs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int N = 0;
    int K = 0;
    /* Every rank sees the same argv, so all ranks take the same branch. */
    if (argc < 3 ||
        parse_int_arg(argv[1], &N) != 0 || parse_int_arg(argv[2], &K) != 0 ||
        N <= 0 || N > INT_MAX / N) {
        if (rank == MASTER) {
            fprintf(stderr, "Usage: mpirun -np P %s N k\n", argv[0]);
        }
        MPI_Finalize();
        return EXIT_FAILURE;
    }

    const int total = N * N;   /* guarded above against int overflow */
    int *M = alloc_ints_or_abort((size_t)total, rank);
    int *base = alloc_ints_or_abort((size_t)total, rank);

    if (rank == MASTER) {
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                M[i * N + j] = i + j;
                base[i * N + j] = i + j;
            }
        }
    }

    MPI_Bcast(M, total, MPI_INT, MASTER, MPI_COMM_WORLD);
    MPI_Bcast(base, total, MPI_INT, MASTER, MPI_COMM_WORLD);

    /* Columns [start, end) of the product belong to this rank. */
    const int start = slice_begin(rank, N, procs);
    const int end = slice_begin(rank + 1, N, procs);
    const int num_cols = end - start;   /* 0 when there are more ranks than columns */

    int *res = alloc_ints_or_abort((size_t)N * (size_t)num_cols, rank);

    /* The master reuses one receive buffer sized for the widest worker slice. */
    int *block = NULL;
    if (rank == MASTER) {
        int widest = 0;
        for (int proc = 1; proc < procs; proc++) {
            int cols = slice_begin(proc + 1, N, procs) - slice_begin(proc, N, procs);
            if (cols > widest) {
                widest = cols;
            }
        }
        block = alloc_ints_or_abort((size_t)N * (size_t)widest, rank);
    }

    for (int iter = 1; iter < K; iter++) {
        for (int i = 0; i < N * num_cols; i++) {
            res[i] = 0;
        }

        /* res = base * M restricted to this rank's columns. The k loop sits
         * outside the j loop so M and res are both walked contiguously. */
        for (int r = 0; r < N; r++) {
            for (int k = 0; k < N; k++) {
                const int factor = base[r * N + k];
                for (int j = start; j < end; j++) {
                    res[r * num_cols + (j - start)] += factor * M[k * N + j];
                }
            }
        }

        if (rank != MASTER) {
            /* The master derives each slice's bounds from the sender's rank,
             * so only the data itself has to travel. Empty slices are skipped
             * on both sides. */
            if (num_cols > 0) {
                MPI_Send(res, N * num_cols, MPI_INT, MASTER, Tag_Send, MPI_COMM_WORLD);
            }
        } else {
            if (num_cols > 0) {
                store_slice(M, N, res, start, num_cols);
            }

            for (int proc = 1; proc < procs; proc++) {
                const int startP = slice_begin(proc, N, procs);
                const int cols = slice_begin(proc + 1, N, procs) - startP;
                if (cols == 0) {
                    continue;
                }
                MPI_Recv(block, N * cols, MPI_INT, proc, Tag_Send, MPI_COMM_WORLD,
                         MPI_STATUS_IGNORE);
                store_slice(M, N, block, startP, cols);
            }
        }

        /* One collective replaces a loop of point-to-point sends. It also
         * keeps workers from overwriting M before the master has finished
         * assembling it. */
        MPI_Bcast(M, total, MPI_INT, MASTER, MPI_COMM_WORLD);
    }

    if (rank == MASTER) {
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                printf("%d", M[i * N + j]);
                if (j < N - 1) printf(" ");
            }
            printf("\n");
        }
    }

    free(block);
    free(M);
    free(base);
    free(res);
    MPI_Finalize();
    return 0;
}
